{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "960281b9-34a5-40bd-ad56-f6e83aed2882",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "83d7406e-3be7-4e6b-9c68-da7f67b82647",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.5.0'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sklearn.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef5cbf9e-f962-4d60-8daf-e6f4a791e3bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "year = 2023\n",
    "month = 3\n",
    "\n",
    "input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'\n",
    "output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "007cfab3-f7a1-4503-88a4-0006162b8d26",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7836ccfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('model.bin', 'rb') as f_in:\n",
    "    dv, lr = pickle.load(f_in)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "41c08294",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical = ['PULocationID', 'DOLocationID']\n",
    "\n",
    "def read_data(filename):\n",
    "    df = pd.read_parquet(filename)\n",
    "    \n",
    "    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime\n",
    "    df['duration'] = df.duration.dt.total_seconds() / 60\n",
    "\n",
    "    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()\n",
    "\n",
    "    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3a1d318c-c4b7-4df8-be68-1ec42d312f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = read_data(input_file)\n",
    "df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9dd40a24-07ce-4863-b4ba-d923de41f9fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>VendorID</th>\n",
       "      <th>tpep_pickup_datetime</th>\n",
       "      <th>tpep_dropoff_datetime</th>\n",
       "      <th>passenger_count</th>\n",
       "      <th>trip_distance</th>\n",
       "      <th>RatecodeID</th>\n",
       "      <th>store_and_fwd_flag</th>\n",
       "      <th>PULocationID</th>\n",
       "      <th>DOLocationID</th>\n",
       "      <th>payment_type</th>\n",
       "      <th>...</th>\n",
       "      <th>extra</th>\n",
       "      <th>mta_tax</th>\n",
       "      <th>tip_amount</th>\n",
       "      <th>tolls_amount</th>\n",
       "      <th>improvement_surcharge</th>\n",
       "      <th>total_amount</th>\n",
       "      <th>congestion_surcharge</th>\n",
       "      <th>Airport_fee</th>\n",
       "      <th>duration</th>\n",
       "      <th>ride_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>2023-03-01 00:06:43</td>\n",
       "      <td>2023-03-01 00:16:43</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>N</td>\n",
       "      <td>238</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11.10</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>2023/03_0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2023-03-01 00:08:25</td>\n",
       "      <td>2023-03-01 00:39:30</td>\n",
       "      <td>2.0</td>\n",
       "      <td>12.40</td>\n",
       "      <td>1.0</td>\n",
       "      <td>N</td>\n",
       "      <td>138</td>\n",
       "      <td>231</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>12.54</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>76.49</td>\n",
       "      <td>2.5</td>\n",
       "      <td>1.25</td>\n",
       "      <td>31.083333</td>\n",
       "      <td>2023/03_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>2023-03-01 00:15:04</td>\n",
       "      <td>2023-03-01 00:29:26</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.30</td>\n",
       "      <td>1.0</td>\n",
       "      <td>N</td>\n",
       "      <td>140</td>\n",
       "      <td>186</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>3.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>4.65</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>28.05</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>14.366667</td>\n",
       "      <td>2023/03_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>2023-03-01 00:49:37</td>\n",
       "      <td>2023-03-01 01:01:05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.90</td>\n",
       "      <td>1.0</td>\n",
       "      <td>N</td>\n",
       "      <td>140</td>\n",
       "      <td>43</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>3.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>4.10</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>24.70</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>11.466667</td>\n",
       "      <td>2023/03_3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2023-03-01 00:08:04</td>\n",
       "      <td>2023-03-01 00:11:06</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>N</td>\n",
       "      <td>79</td>\n",
       "      <td>137</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>2.44</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>14.64</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3.033333</td>\n",
       "      <td>2023/03_4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \\\n",
       "0         2  2023-03-01 00:06:43   2023-03-01 00:16:43              1.0   \n",
       "1         2  2023-03-01 00:08:25   2023-03-01 00:39:30              2.0   \n",
       "2         1  2023-03-01 00:15:04   2023-03-01 00:29:26              0.0   \n",
       "3         1  2023-03-01 00:49:37   2023-03-01 01:01:05              1.0   \n",
       "4         2  2023-03-01 00:08:04   2023-03-01 00:11:06              1.0   \n",
       "\n",
       "   trip_distance  RatecodeID store_and_fwd_flag PULocationID DOLocationID  \\\n",
       "0           0.00         1.0                  N          238           42   \n",
       "1          12.40         1.0                  N          138          231   \n",
       "2           3.30         1.0                  N          140          186   \n",
       "3           2.90         1.0                  N          140           43   \n",
       "4           1.23         1.0                  N           79          137   \n",
       "\n",
       "   payment_type  ...  extra  mta_tax  tip_amount  tolls_amount  \\\n",
       "0             2  ...    1.0      0.5        0.00           0.0   \n",
       "1             1  ...    6.0      0.5       12.54           0.0   \n",
       "2             1  ...    3.5      0.5        4.65           0.0   \n",
       "3             1  ...    3.5      0.5        4.10           0.0   \n",
       "4             1  ...    1.0      0.5        2.44           0.0   \n",
       "\n",
       "   improvement_surcharge  total_amount  congestion_surcharge  Airport_fee  \\\n",
       "0                    1.0         11.10                   0.0         0.00   \n",
       "1                    1.0         76.49                   2.5         1.25   \n",
       "2                    1.0         28.05                   2.5         0.00   \n",
       "3                    1.0         24.70                   2.5         0.00   \n",
       "4                    1.0         14.64                   2.5         0.00   \n",
       "\n",
       "    duration    ride_id  \n",
       "0  10.000000  2023/03_0  \n",
       "1  31.083333  2023/03_1  \n",
       "2  14.366667  2023/03_2  \n",
       "3  11.466667  2023/03_3  \n",
       "4   3.033333  2023/03_4  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "669fda0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "dicts = df[categorical].to_dict(orient='records')\n",
    "X_val = dv.transform(dicts)\n",
    "y_pred = lr.predict(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "64aadfcb-d82b-4d7b-a40e-c8373d0c3330",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,\n",
       "       13.11317847, 12.89999218])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "824811df-af93-476b-8f22-68f878b3ba67",
   "metadata": {},
   "source": [
    "### Q1. Standard deviation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d8c487b9-d630-424f-ac3e-7ca31f2bed44",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6.247488852238703"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred.std()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "079be83b-17bb-4c71-b15f-336673dee2ca",
   "metadata": {},
   "source": [
    "### Q2. Preparing the output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "92599455-2191-4ed7-afa3-98f510a41e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result = pd.DataFrame()\n",
    "df_result['ride_id'] = df['ride_id']\n",
    "df_result['predicted_duration'] = y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "353c91e8-bdf7-4f13-a28b-586fd595d0b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result.to_parquet(\n",
    "    output_file,\n",
    "    engine='pyarrow',\n",
    "    compression=None,\n",
    "    index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d1b9a4df-ea69-4e7a-a41c-4c19f24820d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 66M\n",
      "-rw-rw-rw- 1 codespace codespace 66M Jun  7 19:28 yellow_tripdata_2023-03.parquet\n"
     ]
    }
   ],
   "source": [
    "!ls -lh output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "982b2006-fd32-440b-a27c-e0f843b53ed1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
